*******************************************************************************************************************************************
* Stata do file to replicate results in 'The impact of Covid-19 on productivitiy', published in Review of Economics and Statistics, 2022  *
* Nicholas Bloom, Philip Bunn, Paul Mizen, Pawel Smietanka, Gregory Thwaites															  *
*******************************************************************************************************************************************

* First define working directory where all raw datasets are stored
cd "H:\DataPrivilege Secured Datasets\Decision Making Project\Covid and productivity paper\Final paper October 2022\Replication files"
*cd "C:\Replication files"

* Open log file
log using ".\Stata log file for impact of Covid on productivity.txt", text replace


* Structure of this do file is as follows:
* First import various aggregate data files into Stata to merge later on
* Then prepare BVD company accounts data
* Next, calculate within firm Covid productivity impacts from the DMP data
* Then merge all the different datasets together
* Once all data is set up, the analysis is carried out 
* The final section is on approximately replicating results without BVD data


* The following datasets are used in this do file
* Aggregate data - all available in replication files
	* ONS data on importance of intermediate consumption by industry - from ONS input output tables.xlsx
	* ONS data on Business Register employment shares for 2019 - from 2019 business population estimates.xlsx 
	* ONS data on GDP deflator - from ONS data downloads.xlsx
	* ONS data on industry level factor shares - from UK TFP data.xlsx
	* Data on teleworkability by industry from Dingel and Neiman (2020) - from Teleworking by industry.xlsx 
	* BVD data on productivity by industry  - from BVD pre Covid productivity by industry.xlsx
	* Coefficients for productivity imputation - from Coefficients for productivity imputation.xlsx
* BVD company accounts data - proprietary data, not available in the replication files because our licence does not allow us to share the microdata
* Decision Maker Panel microdata - proprietary data, data not posted as part of replication files but anonymised data are available via the ONS Secure Research Service (see main README file for more details on how to access)


* Because BVD company accounts data are not available, not all results in the paper can be replicated using the DMP data in the SRS
* However, the replication files do include details of how to produce a close approximation of the the main results using the DMP microdata but without using BVD data
* Here, the main input from BVD, the level of pre-Covid productivity at the firm leve, is imputed using industry data (which is included in the replication files) and other variables in the DMP data
* See final section of the do file for more details on this


***************************
* Set aggregate datasets  *
***************************

* Load ONS data into Stata on importance of intermediate consumption by industry
import excel ".\ONS input output tables.xlsx", sheet("Stata") first clear
label var y_gva "Industry share of GVA in total output"
label var m_y "Industry share of intermediate consumption in GVA"
save ".\ONS industry cost shares.dta", replace

* Load ONS 2019 Business Register employment shares to use in weighting
* Just industry weights
import excel ".\2019 business population estimates.xlsx", sheet("Stata1") first clear
save ".\2019 Business register weight shares industry only.dta", replace
* Industry and size groups weights
import excel ".\2019 business population estimates.xlsx", sheet("Stata2") first clear
reshape long wgrp_share, i(size2) j(ind2)
egen wgrp=group(ind2 size2)              //Group for weighting//
keep wgrp wgrp_share
save ".\2019 Business register weight shares with size and industry.dta", replace

* ONS GDP deflator data
import excel ".\ONS data downloads.xlsx", sheet("GDP deflator for Stata") first clear
destring year, replace
replace gdpdef=0.01*gdpdef
label var gdpdef "GDP deflator"
save ".\GDP deflator.dta", replace

* ONS data on industry level factor shares
import excel ".\UK TFP data.xlsx", sheet("Stata") first clear
label var lab_weight "Industry level labour weight in TFP"
label var lab_weight_all "Aggregate labour weight in TFP"
save ".\ONS industry level factor shares.dta", replace

* Data on teleworking by industry
import excel ".\Teleworking by industry.xlsx", sheet("Stata") first clear
label var tele "Percentage of jobs that can by done from home (industry level)"
save ".\Teleworking by industry.dta", replace

* BVD data on pre-Covid productivity by industry and firm size for approximate replication of main results
import excel ".\BVD pre Covid productivity by industry.xlsx", sheet("Stata") first clear
label var prod_last "Industry labour productivity from BVD (Pre-Covid)"
label var tfp2_last "Industry TFP using aggregate factor shares from BVD (Pre-Covid)"
save ".\BVD pre Covid productivity by industry.dta", replace

* Coefficients for productivity imputation
import excel ".\Coefficients for productivity imputation.xlsx", sheet("Sheet1") first clear
save ".\Coefficients for productivity imputation.dta", replace



***********************
* Set up BVD dataset  *
***********************

* Load in downloaded BVD data - Last downloaded in June 2022
* This is data for the full DMP sampling frame, approximatley 70,000 firms
* Data are for calendar years
* Note this file is not available in the replication files because our licence does not allow us to share the microdata
use ".\BVD accounts data.dta", clear   

* Keep time period of interest
keep if year>2015 & year<2022

* Keep variables required for Covid and productivity analysis
keep regnum year accmonth lastaccounts sic status incdate totassets sales oprofit labcost wages nemp fixedassets sample
* These variables are as downloaded from BVD, variable labels tell you what the data are
label var regnum "Companies House registered number"
label var year "Accounts year"
label var accmonth "Accounting reference month"
label var lastaccounts "Date of last accounts"
label var sic "SIC industry code"
label var status "Company status"
label var incdate "Incorporation date"
label var sales "Turnover"
label var oprofit "Operating profit"
label var labcost "Remuneration"
label var wages "Wages and Salaries"
label var nemp "Number of employees"
label var fixedassets "Fixed assets"
* Sample is an extra variable for when a firm was added to the sampling frame, added manually
label var sample "When added to DMP sampling frame"
tab sample

* Dummy for having accounts - define having accounts as having total assets, sales or employment
gen hasacc=0
replace hasacc=1 if totassets~=. & nemp~=. & sales~=.
label var hasacc "Dummy for having accounts data"

* Generate year of incorporation
gen incyear=year(incdate)
label var incyear "Incorporation year"
drop incdate

* Define firm age
gen age=year-incyear
label var age "Firm age"

* Clean out any negative sales values
replace sales=. if sales<0

* Create industry groups
egen ind2=cut(sic), at(0,5000,10000,35000,41000,45000,49000,55000,58000,64000,68000,69000,77000,84000,85000,86000,90000,94000,100000)
replace ind2=ind2/1000
recode ind2 (0=2) (5=2) (10=1) (35=2) (41=3) (45=4) (49=5) (55=6) (58=7) (64=8) (68=9) (69=10) (77=11) (84=14) (85=14) (86=12) (90=13) (94=14)
*label define ind2 1 "Manufacturing" 2 "Other Production" 3 "Construction" 4 "Wholesale & Retail" 5 "Transport & Storage" 6 "Accom & Food" 7 "Info & Comms" 8 "Finance & Insurance" 9 "Real Estate" 10 "Prof & Scientific" 11 "Admin & Support" 12 "Health" 13 "Recreational Services" 14 "Other Services"
label values ind2 ind2
label var ind2 "Industry group"

* Merge in GDP deflator data
merge m:m year using ".\GDP deflator.dta", nogen

* Merge in industry level factor shares data
merge m:m ind2 using ".\ONS industry level factor shares.dta", nogen

* Generate real GVA - don't allow negative values
gen gva_real=(oprofit + labcost)/gdpdef 
replace gva_real=. if gva_real<0
label var gva_real "Real GVA"

* Generate labour productivity - don't allow negative values
gen prod = gva_real/nemp if (gva_real/nemp)>0
gen lnprod=ln(prod)
* Productivity growth and then clean at 1st and 99th percentiles
egen id=group(regnum)
label var id "Numeric firm id"
xtset id year
* Productivity growth - log growth rate
gen dlnprod=d.lnprod
label var prod "Labour productivity"
label var lnprod "Log of labour productivity"
label var dlnprod "Labour productivity growth"

* Generate TFP
* For this paper use TFP without normalising at industry level because need levels differences between industries to calculation reallocation effects
* Using aggregate factor shares for weight of capital and labour
xtset id year
gen lntfp2=ln(gva_real)-lab_weight_all*ln(labcost/gdpdef)-(1-lab_weight_all)*ln(fixedassets/gdpdef)
gen dlntfp2=d.lntfp2
* Using industry level factor shares
gen lntfp3=ln(gva_real)-lab_weight*ln(labcost/gdpdef)-(1-lab_weight)*ln(fixedassets/gdpdef)
* Calcualte absolute tfp levels
gen tfp2=exp(lntfp2)
gen tfp3=exp(lntfp3)
label var lntfp2 "Log of TFP using aggregate factor shares"
label var lntfp3 "Log of TFP using industry factor shares"
label var dlntfp "TFP growth using aggregate factor shares"
label var tfp2 "TFP using aggregate factor shares"
label var tfp3 "TFP using industry factor shares"

* Average wage per employee
gen wage=wages/nemp
label var wage "Average wage per employee"
* Log wage
gen lnwage=ln(wage)
label var lnwage "Log of average wage per employee"


* Set up pre-pandemic variables from accounts data to merge with DMP
preserve
 * Keep variables of interest
keep regnum id ind2 year nemp lnprod prod tfp2 lntfp2 tfp3 lntfp3 lnwage lab_weight sample sic
* Reshape to help calculate averages
reshape wide nemp lnprod prod tfp2 lntfp2 tfp3 lntfp3 lnwage ,  i(regnum) j(year) 
* Last observation prior to pandemic for key variables 
foreach i in nemp lnprod prod tfp2 lntfp2 tfp3 lntfp3 lnwage { 
gen `i'_last=`i'2019
replace `i'_last=`i'2018 if `i'_last==.	
replace `i'_last=`i'2017 if `i'_last==.	
}
keep regnum ind2 *_last prod2019 tfp22019 lab_weight sample sic
rename prod2019 prod_2019
rename tfp22019 tfp2_2019
label var nemp_last "Number of employees (Pre-Covid)"
label var prod_last "Labour productivity (Pre-Covid)"
label var prod_2019 "Labour productivity in 2019"
label var lnprod_last "Log of labour productivity (Pre-Covid)"
label var lnwage_last "Log of average wage per employee (Pre-Covid)"
label var lntfp2_last "Log of TFP using aggregate factor shares (Pre-Covid)"
label var lntfp3_last "Log of TFP using industry factor shares (Pre-Covid)"
label var tfp2_last "TFP using aggregate factor shares (Pre-Covid)"
label var tfp2_2019 "TFP using aggregate factor shares in 2019"
label var tfp3_last "TFP using industry factor shares (Pre-Covid)"
* Winsorise data
winsor2 *_last prod_2019 tfp2_2019, replace cuts(5 95) by(ind2)
save ".\BVD data to merge for Covid and productivity analysis.dta", replace
restore


* Set up data for firm entry/exit analysis

* Winsorise data
winsor2 prod lnprod tfp2 tfp3 lntfp2 lntfp3 dlnprod dlntfp2 lnwage, replace cuts(5 95) by(ind2 year)

* Define new firm - use first year of reported accounts, incorporation date 2016 onwards and accounts within 2 years of incorporation date
gen new=0 if hasacc==1
bysort id: egen yr1=min(year) if hasacc==1 
replace new=1 if year==yr1 & incyear>2015 & incyear~=. & (age>=0 & age<3)  //Allow first year of accounts to be in year of incorporation
tab age if new==1
xtset id year
gen yr2=0 if hasacc==1
replace yr2=1 if l.new==1
gen yr3=0 if hasacc==1
replace yr3=1 if l2.new==1
label var new "Dummy for new firm 1st year of accounts"
label var yr2 "Dummy for new firm 2nd year of accounts"
label var yr3 "Dummy for new firm 3rd year of accounts"

* Define failing firms and last year before failure
gen lastaccyr=year(lastaccounts)
gen fail=0 if status~=""
replace fail=1 if status=="Dissolved" & year==lastaccyr
replace fail=1 if status=="In liquidation" & year==lastaccyr
label var fail "Dummy for firm in its last year before failing"
drop yr1 lastaccyr

* Dummy for failing during Covid period - reference period is failing 2017-19 (add one year here given talking about about last accounts), Covid period is 2020-2022, so more scope for firms to fail yet
gen cov_fail=0 if year==2016 | year==2017 | year==2018
replace cov_fail=1 if year==2019 | year==2020 | year==2021
label var cov_fail "Covid period dummy for failure analysis"

* Save BVD data to use in analysis of productivity and firm entry/exit
save ".\BVD data for entry and exit analysis.dta", replace


* Set up accounts data for 2019 for benchmark to the DMP data
preserve
keep if year==2019
keep sales nemp regnum
save ".\2019 accounts data to compare with DMP.dta", replace
restore


* Set up productivity growth from accounts data to compare to DMP for the first year of pandemic (financial year 2020)
preserve
* Switch to financial years to align with pandemic 
replace year=year-1 if accmonth>0 & accmonth<4
keep if year==2020 
keep  dlnprod dlntfp2 ind2 regnum accmonth 
save ".\BVD Covid period productivity growth to merge.dta", replace
restore




************************************************************
* Calculate within firm productivity impacts from DMP data *
************************************************************


* Load DMP data
use ".\DMP data before anonymisation August 2016 to July 2022.dta", clear
* For SRS users use the following file (also need to replace regnum with id as missing from this dataset):
* use ".\Decision Maker Panel anonymised August 2016 to July 2022.dta", clear
* gen regnum=id


* Remove firms in panels with no relevant data (introductory panel and follow up phone waves)
drop if panel==4 | panel==5
xtset id wave

* Calculate quantitative measure of how Covid-19 has affected quantity of goods and services produced
* Average quality adjustment - assign midpoints to categorical responses
tab covid_goods
gen c19_qual_gds=20 if covid_goods==1
replace c19_qual_gds=5 if covid_goods==2
replace c19_qual_gds=0 if covid_goods==3
replace c19_qual_gds=-5 if covid_goods==4
replace c19_qual_gds=-20 if covid_goods==5
tab covid_services
gen c19_qual_ser=20 if covid_services==1
replace c19_qual_ser=5 if covid_services==2
replace c19_qual_ser=0 if covid_services==3
replace c19_qual_ser=-5 if covid_services==4
replace c19_qual_ser=-20 if covid_services==5
table covid_goods, c(mean c19_qual_gds)
table covid_services, c(mean c19_qual_ser)
* Have seperate questions for goods and services, so convert to one variable depending on sector
gen c19_qual=c19_qual_gds if ind2==1 | ind2==2                         					//Goods only for manaufacturing and other production
replace c19_qual=c19_qual_ser if (ind2==1 | ind2==2) & c19_qual==.
egen c19_qual_av=rmean( c19_qual_gds c19_qual_ser)
replace c19_qual=c19_qual_av
replace c19_qual=c19_qual_av if (ind2==3 | ind2==4 ) & c19_qual==.  					//Use average for construction, wholesale and retail //
replace c19_qual=c19_qual_ser if ind2==5 | (ind2>5 & ind2<15)                           //Services only for the rest
replace c19_qual=c19_qual_gds if (ind2==5 | (ind2>6 & ind2<15)) & c19_qual==.
su c19_qual_gds c19_qual_ser c19_qual
su c19_qual_gds c19_qual_ser c19_qual [aw=weight]                          
table ind2 [aw=weight], c(mean c19_qual_gds mean c19_qual_ser mean c19_qual)
* Generate quality adjustment industry average
gen c19_qual_ind=.
qui forvalues x = 1/14 { 
su c19_qual if ind2==`x' [aw=weight]  , d 
replace c19_qual_ind = r(mean) if ind2== `x' 
}
table ind2 [aw=weight], c(mean c19_qual mean c19_qual_ind)
* Use lagged quality data if missing
replace c19_qual=l.c19_qual if c19_qual==.

* Percentage of sales in 2019 that involved face to face contact with customers - variable across all waves
bysort id: egen f2f=mean(sales2019_f2f)

 
* Keep variables and time periods of interest
* This covers all data from and excludes data from the introductory panel of the survey (which is not used in this paper)
keep if wave>43 & panel<4 & wave<72
keep wave covid* ind2 id regnum emp_fur* weight f2f  emp_unable* dateq c19_qual c19_qual_ind size2
 
 
* Calculate average size group each firm is in over period of interest
bysort id: egen size=median(size2)
replace size=10 if size<250
drop size2

 
* Renaming cost variable to help loops
rename covid_cost_* covid_cost*
* For now, set impact of Covid-19 on average hours worked per employee to zero periods where don't have any data, this is imputed later on
gen covid_emphrs21q3=0   
gen covid_emphrs21q4=0
gen covid_emphrs22q1=0
gen covid_emphrs22q2=0
gen covid_emphrs22q3=0
gen covid_emphrs22q4=0
gen covid_emphrs22=0
gen covid_emphrs23=0
* Also covid costs for 2022 Q3 where don't have any data, just leave as blank variable and impute later
gen covid_cost22q3=.

* Balance panel
reshape wide covid* emp_fur* emp_unable* weight f2f dateq c19_qual size , i(id) j(wave)

* Most recent observation for each variable
foreach i in sales emp capex emphrs cost {
egen c19_`i'20q3=rfirst(covid_`i'20q351 covid_`i'20q350 covid_`i'20q349 covid_`i'20q348 covid_`i'20q347)
egen c19_`i'20q4=rfirst(covid_`i'20q453 covid_`i'20q452 covid_`i'20q451 covid_`i'20q450 covid_`i'20q449 covid_`i'20q448 covid_`i'20q447)
egen c19_`i'21q1=rfirst(covid_`i'21q156 covid_`i'21q155 covid_`i'21q154 covid_`i'21q153 covid_`i'21q152 covid_`i'21q151 covid_`i'21q150 covid_`i'21q149 covid_`i'21q148 covid_`i'21q147)
egen c19_`i'21q2=rfirst(covid_`i'21q259 covid_`i'21q258 covid_`i'21q257 covid_`i'21q256 covid_`i'21q255 covid_`i'21q254 covid_`i'21q253 covid_`i'21q252 covid_`i'21q251 covid_`i'21q250 covid_`i'21q249 covid_`i'21q248 covid_`i'21q247)
egen c19_`i'21q3=rfirst(covid_`i'21q362 covid_`i'21q361 covid_`i'21q360 covid_`i'21q359 covid_`i'21q358 covid_`i'21q357 covid_`i'21q356 covid_`i'21q355 covid_`i'21q354 covid_`i'21q353)
egen c19_`i'21q4=rfirst(covid_`i'21q465 covid_`i'21q464 covid_`i'21q463 covid_`i'21q462 covid_`i'21q461 covid_`i'21q460 covid_`i'21q459 covid_`i'21q458 covid_`i'21q457 )
egen c19_`i'22=rfirst(covid_`i'2262 covid_`i'2261 covid_`i'2260 covid_`i'2259 covid_`i'2258 covid_`i'2257 covid_`i'2256 covid_`i'2255 covid_`i'2254 covid_`i'2253 covid_`i'2252 covid_`i'2251)
egen c19_`i'22q1=rfirst(covid_`i'22q168 covid_`i'22q167 covid_`i'22q166 covid_`i'22q165 covid_`i'22q164 covid_`i'22q163)
egen c19_`i'22q2=rfirst(covid_`i'22q268 covid_`i'22q267 covid_`i'22q266 covid_`i'22q265 covid_`i'22q264 covid_`i'22q263)
egen c19_`i'22q3=rfirst(covid_`i'22q368 covid_`i'22q367 covid_`i'22q366)
egen c19_`i'23=rfirst(covid_`i'2368 covid_`i'2367 covid_`i'2366 covid_`i'2365 covid_`i'2364 covid_`i'2363)
}
egen c19_emphrs20q2=rfirst(covid_emphrs20q250 covid_emphrs20q249 covid_emphrs20q248)
reshape long


* Adjust 2020 Q2 data for when it was collected
* Big changes from one month to the next within this quarter (eg because of annoucement of Government supprt), so account for this in imputation
* Need to adjust data that was collected before the quarter ended
* Not adding any new data here, just adjusting the past to be more consistent with end quarter data
qui foreach i in sales20 emp20 capex20 {
* Regressions to generate average variables
reg covid_`i'q2 i.ind2 if wave==45 [aw=weight]
predict `i'_45
reg covid_`i'q2 i.ind2 if wave==46 [aw=weight]
predict `i'_46
reg covid_`i'q2 i.ind2 if wave==47 [aw=weight]
predict `i'_47
 * Generate combined covid impacts, add average difference to lagged observations
xtset id wave
gen c19_`i'q2=l2.covid_`i'q2 if wave==49 
replace c19_`i'q2=l3.covid_`i'q2+(`i'_47-`i'_46) if wave==49 & c19_`i'q2==.
replace c19_`i'q2=l4.covid_`i'q2+(`i'_47-`i'_45) if wave==49 & c19_`i'q2==.
}
* Only asked cost question once, so can't impute this
gen c19_cost20q2=covid_cost20q2


 
* Monthly employees on furlough data
* Data here refer to a specific month, but large changes in months within a quarter, so impute data for each month 
* Use industry average change relative to last firm data point to impute for missing months

* 2020 Q2
reg emp_furlough i.ind2 if wave==44 [aw=weight]
predict fur_44
reg emp_furlough i.ind2 if wave==45 [aw=weight]
predict fur_45
reg emp_furlough i.ind2 if wave==46 [aw=weight]
predict fur_46
* Furlough data for each month in Q2
* April
gen fur_apr=emp_furlough if wave==44
replace fur_apr=f.emp_furlough+fur_44-fur_45 if wave==44 & fur_apr==.
replace	fur_apr=f2.emp_furlough+fur_44-fur_46 if wave==44 & fur_apr==.
replace fur_apr=0 if fur_apr<0
replace fur_apr=100 if fur_apr>100 & fur_apr~=.
* May
gen fur_may=emp_furlough if wave==45
replace fur_may=l.emp_furlough+fur_45-fur_44 if wave==45 & fur_may==.
replace fur_may=f.emp_furlough+fur_45-fur_46 if wave==45 & fur_may==.
replace fur_may=0 if fur_may<0
replace fur_may=100 if fur_may>100 & fur_may~=.
* June
gen fur_jun=emp_furlough if wave==46
replace fur_jun=l.emp_furlough+fur_46-fur_45 if wave==46 & fur_jun==.
replace	fur_jun=l2.emp_furlough+fur_46-fur_44 if wave==46 & fur_jun==.
replace fur_jun=0 if fur_jun<0
replace fur_jun=100 if fur_jun>100 & fur_jun~=.

* 2020 Q3
reg emp_furlough i.ind2 if wave==47 [aw=weight]
predict fur_47
reg emp_furlough i.ind2 if wave==48 [aw=weight]
predict fur_48
reg emp_furlough i.ind2 if wave==49 [aw=weight]
predict fur_49
* July
gen fur_jul=emp_furlough if wave==47
replace fur_jul=f.emp_furlough+fur_47-fur_48 if wave==47 & fur_jul==.
replace	fur_jul=f2.emp_furlough+fur_47-fur_49 if wave==47 & fur_jul==.
replace fur_jul=0 if fur_jul<0
replace fur_jul=100 if fur_jul>100 & fur_jul~=.
* August
gen fur_aug=emp_furlough if wave==48
replace fur_aug=l.emp_furlough+fur_48-fur_47 if wave==48 & fur_aug==.
replace fur_aug=f.emp_furlough+fur_48-fur_49 if wave==48 & fur_aug==.
replace fur_aug=0 if fur_aug<0
replace fur_aug=100 if fur_aug>100 & fur_aug~=.
* September
gen fur_sep=emp_furlough if wave==49
replace fur_sep=l.emp_furlough+fur_49-fur_48 if wave==49 & fur_sep==.
replace	fur_sep=l2.emp_furlough+fur_49-fur_47 if wave==49 & fur_sep==.
replace fur_sep=0 if fur_sep<0
replace fur_sep=100 if fur_sep>100 & fur_sep~=.

* 2020 Q4
reg emp_furlough i.ind2 if wave==50 [aw=weight]
predict fur_50
reg emp_furlough i.ind2 if wave==51 [aw=weight]
predict fur_51
reg emp_furlough i.ind2 if wave==52 [aw=weight]
predict fur_52
* October
gen fur_oct=emp_furlough if wave==50
replace fur_oct=f.emp_furlough+fur_50-fur_51 if wave==50 & fur_oct==.
replace	fur_oct=f2.emp_furlough+fur_50-fur_52 if wave==50 & fur_oct==.
replace fur_oct=0 if fur_oct<0
replace fur_oct=100 if fur_oct>100 & fur_oct~=.
* November
gen fur_nov=emp_furlough if wave==51
replace fur_nov=l.emp_furlough+fur_51-fur_50  if wave==51 & fur_nov==.
replace fur_nov=f.emp_furlough+fur_51-fur_52 if wave==51 & fur_nov==.
replace fur_nov=0 if fur_nov<0
replace fur_nov=100 if fur_nov>100 & fur_nov~=.
* December
gen fur_dec=emp_furlough if wave==52
replace fur_dec=l.emp_furlough+fur_52-fur_51 if wave==52 & fur_dec==.
replace	fur_dec=l2.emp_furlough+fur_52-fur_50 if wave==52 & fur_dec==.
replace fur_dec=0 if fur_dec<0
replace fur_dec=100 if fur_dec>100 & fur_dec~=.

* 2021 Q1
reg emp_furlough i.ind2 if wave==53 [aw=weight]
predict fur_53
reg emp_furlough i.ind2 if wave==54 [aw=weight]
predict fur_54
reg emp_furlough i.ind2 if wave==55 [aw=weight]
predict fur_55
* January
gen fur_jan=emp_furlough if wave==53
replace fur_jan=f.emp_furlough+fur_53-fur_54 if wave==53 & fur_jan==.
replace	fur_jan=f2.emp_furlough+fur_53-fur_55 if wave==53 & fur_jan==.
replace fur_jan=0 if fur_jan<0
replace fur_jan=100 if fur_jan>100 & fur_jan~=.
* February
gen fur_feb=emp_furlough if wave==54
replace fur_feb=l.emp_furlough+fur_54-fur_53  if wave==54 & fur_feb==.
replace fur_feb=f.emp_furlough+fur_54-fur_55 if wave==54 & fur_feb==.
replace fur_feb=0 if fur_feb<0
replace fur_feb=100 if fur_feb>100 & fur_feb~=.
* March
gen fur_mar=emp_furlough if wave==55
replace fur_mar=l.emp_furlough+fur_55-fur_54 if wave==55 & fur_mar==.
replace	fur_mar=l2.emp_furlough+fur_55-fur_53 if wave==55 & fur_mar==.
replace fur_mar=0 if fur_mar<0
replace fur_mar=100 if fur_mar>100 & fur_mar~=.

* 2021 Q2
reg emp_furlough i.ind2 if wave==56 [aw=weight]
predict fur_56
reg emp_furlough i.ind2 if wave==57 [aw=weight]
predict fur_57
reg emp_furlough i.ind2 if wave==58 [aw=weight]
predict fur_58
* April
gen fur_apr21=emp_furlough if wave==56
replace fur_apr21=f.emp_furlough+fur_56-fur_57 if wave==56 & fur_apr21==.
replace	fur_apr21=f2.emp_furlough+fur_56-fur_58 if wave==56 & fur_apr21==.
replace fur_apr21=0 if fur_apr21<0
replace fur_apr21=100 if fur_apr21>100 & fur_apr21~=.
* May
gen fur_may21=emp_furlough if wave==57
replace fur_may21=l.emp_furlough+fur_57-fur_56  if wave==57 & fur_may21==.
replace fur_may21=f.emp_furlough+fur_57-fur_56 if wave==57 & fur_may21==.
replace fur_may21=0 if fur_may21<0
replace fur_may21=100 if fur_may21>100 & fur_may21~=.
* June
gen fur_jun21=emp_furlough if wave==58
replace fur_jun21=l.emp_furlough+fur_58-fur_57 if wave==58 & fur_jun21==.
replace	fur_jun21=l2.emp_furlough+fur_58-fur_56 if wave==58 & fur_jun21==.
replace fur_jun21=0 if fur_jun21<0
replace fur_jun21=100 if fur_jun21>100 & fur_jun21~=.

* For 2021 Q3, don't use monthly adjustment anymore as all low and close to 2% so makes no difference
gen c19_fur21q3=emp_furlough if wave==59
replace c19_fur21q3=emp_furlough if wave==60
replace c19_fur21q3=emp_furlough if wave==61

* Cross check fitted furlough data against actual
table wave , c(mean emp_furlough)
su fur_apr- fur_jun fur_jul- fur_sep fur_oct-fur_dec fur_jan-fur_mar fur_apr21-fur_jun21 c19_fur21q3, sep(20)


* Percentage of employees unable to work - Just using reported data here as not a key input into calcuations
gen c19_unable20q2=emp_unable if dateq==241
gen c19_unable20q3=emp_unable if dateq==242
gen c19_unable20q4=emp_unable if dateq==243
gen c19_unable21q1=emp_unable if dateq==244
gen c19_unable21q2=emp_unable if dateq==245
gen c19_unable21q3=emp_unable if dateq==246
gen c19_unable21q4=emp_unable if dateq==247
gen c19_unable22q1=emp_unable if dateq==248
gen c19_unable22q2=emp_unable if dateq==249
gen c19_unable22=emp_unable_22
gen c19_unable23=emp_unable_23


* Collapse down to one observation per firm so now a wide form dataset
collapse (mean) c19* fur_* covid_* ind2 id f2f size, by(regnum)


* Construct quarterly furlough data from monthly data - its set to zero from 2021 Q4 onwards as furlough scheme has ended then
gen c19_fur20q2=(fur_apr+fur_may+fur_jun)/3
gen c19_fur20q3=(fur_jul+fur_aug+fur_sep)/3
gen c19_fur20q4=(fur_oct+fur_nov+fur_dec)/3
gen c19_fur21q1=(fur_jan+fur_feb+fur_mar)/3
gen c19_fur21q2=(fur_apr21+fur_may21+fur_jun21)/3
gen c19_fur21q4=0
gen c19_fur22q1=0
gen c19_fur22q2=0
gen c19_fur22q3=0
gen c19_fur22q4=0
gen c19_fur23=0
tabstat c19_fur20q2 c19_fur20q3 c19_fur20q4 c19_fur21q1 c19_fur21q2 c19_fur21q3 c19_fur21q4, stats(mean n)
drop fur* 
* Generate furlough and unable to work data before wider imputation
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 23 {
gen covid_fur`i'=c19_fur`i'
gen covid_unable`i'=c19_unable`i'
}


* Impute other variables now - coverage in 2020 Q2 and Q3 is a bit lower than later periods so start with projecting back data for 2020 Q3 then Q2 
* Impute using firm level responses for adjoining quarters, and allow imputation coefficients to vary by industry


* 2020 Q3, impute data using 2020 Q4 data if missing
qui foreach i in  sales emp capex cost emphrs fur unable {
reg covid_`i'20q3 c.c19_`i'20q4#i.ind2 i.ind2
predict `i'20q3_fit
replace c19_`i'20q3=`i'20q3_fit if c19_`i'20q3==.
}

* 2020 Q3, impute data using 2020 Q2 data if missing
qui foreach i in sales emp capex cost emphrs fur unable  {
* Regressions to generate average variables
reg covid_`i'20q3 c.c19_`i'20q2#i.ind2 i.ind2
predict `i'20q3_fitv2
replace c19_`i'20q3=`i'20q3_fitv2 if c19_`i'20q3==.
}

* 2020 Q3, impute data using 2021 Q1 data if missing - use two quarter imputation just for this quarter as this boosts sample size for furlough by about 500
qui foreach i in  sales emp capex cost emphrs fur unable {
reg covid_`i'20q3 c.c19_`i'21q1#i.ind2  i.ind2
predict `i'20q3_fitv3
replace c19_`i'20q3=`i'20q3_fitv3 if c19_`i'20q3==.
}


* 2020 Q2, impute data using 2020 Q3 data if missing
qui foreach i in sales emp capex cost emphrs fur unable  {
reg covid_`i'20q2 c.c19_`i'20q3#i.ind2 i.ind2
predict `i'20q2_fit
replace c19_`i'20q2=`i'20q2_fit if c19_`i'20q2==.
}


* Now impute forwards for 2020 Q4 onwards

* 2020 Q4, impute data using 2020 Q3 data if missing
qui foreach i in  sales emp capex cost emphrs fur unable {
reg covid_`i'20q4 c.c19_`i'20q3#i.ind2 i.ind2
predict `i'20q4_fit
replace c19_`i'20q4=`i'20q4_fit if c19_`i'20q4==.
}

* 2020 Q4, impute data using 2021 Q1 data if missing
qui foreach i in  sales emp capex cost emphrs fur unable {
reg covid_`i'20q4 c.c19_`i'21q1#i.ind2  i.ind2
predict `i'20q4_fitv2
replace c19_`i'20q4=`i'20q4_fitv2 if c19_`i'20q4==.
}


* 2021 Q1, impute data using 2020 Q4 data if missing
qui foreach i in  sales emp capex cost emphrs fur unable {
reg covid_`i'21q1 c.c19_`i'20q4#i.ind2  i.ind2
predict `i'21q1_fit
replace c19_`i'21q1=`i'21q1_fit if c19_`i'21q1==.
}

* 2021 Q1, impute data using 2021 Q2 data if missing
qui foreach i in  sales emp capex cost emphrs fur unable {
reg covid_`i'21q1 c.c19_`i'21q2#i.ind2 i.ind2
predict `i'21q1_fitv2
replace c19_`i'21q1=`i'21q1_fitv2 if c19_`i'21q1==.
}


* 2021 Q2, impute data using 2021 Q1 data if missing
qui foreach i in  sales emp capex cost emphrs fur unable {
reg covid_`i'21q2 c.c19_`i'21q1#i.ind2 i.ind2
predict `i'21q2_fit
replace c19_`i'21q2=`i'21q2_fit if c19_`i'21q2==.
}

* 2021 Q2, impute data using 2021 Q3 data if missing
qui foreach i in  sales emp capex cost emphrs fur unable {
reg covid_`i'21q2 c.c19_`i'21q3#i.ind2 i.ind2
predict `i'21q2_fitv2
replace c19_`i'21q2=`i'21q2_fitv2 if c19_`i'21q2==.
}


* 2021 Q3, impute data using 2021 Q2 data if missing
qui foreach i in  sales emp capex cost emphrs fur unable {
reg covid_`i'21q3 c.c19_`i'21q2#i.ind2 i.ind2
predict `i'21q3_fit
replace c19_`i'21q3=`i'21q3_fit if c19_`i'21q3==.
}

* 2021 Q3, impute data using 2021 Q4 data if missing
qui foreach i in  sales emp capex cost emphrs fur unable {
reg covid_`i'21q3 c.c19_`i'21q4#i.ind2 i.ind2
predict `i'21q3_fitv2
replace c19_`i'21q3=`i'21q3_fitv2 if c19_`i'21q3==.
}


* 2021 Q4, impute data using 2021 Q3 data if missing 
qui foreach i in  sales emp capex cost unable {
reg covid_`i'21q4 c.c19_`i'21q3#i.ind2 i.ind2 
predict `i'21q4_fit
replace c19_`i'21q4=`i'21q4_fit if c19_`i'21q4==.
}

* 2021 Q4, impute data using 2022 Q1 data if missing 
qui foreach i in  sales emp capex cost unable {
reg covid_`i'21q4 c.c19_`i'22q1#i.ind2 i.ind2 
predict `i'21q4_fitv2
replace c19_`i'21q4=`i'21q4_fitv2 if c19_`i'21q4==.
}


* Impute 2022 Q1 data from 2021 Q4 data if missing 
* No need to impute backwards anymore due to way questions were asked (makes no changes)
* Plus also would't want firms with only 2022 data to be used anyway given that misses most of pandemic
qui foreach i in  sales emp capex cost unable {
reg covid_`i'22q1 c.c19_`i'21q4#i.ind2 i.ind2
predict `i'22q1_fit
replace c19_`i'22q1=`i'22q1_fit if c19_`i'22q1==.
}


* Impute 2022 Q2 data from 2022 Q1 data if missing 
qui foreach i in  sales emp capex cost unable {
reg covid_`i'22q2 c.c19_`i'22q1#i.ind2 i.ind2
predict `i'22q2_fit
replace c19_`i'22q2=`i'22q2_fit if c19_`i'22q2==.
}
 
 
* Impute 2022 Q3 data from 2022 Q2 data if missing 
qui foreach i in  sales emp capex  {
reg covid_`i'22q3 c.c19_`i'22q2#i.ind2 i.ind2
predict `i'22q3_fit
replace c19_`i'22q3=`i'22q3_fit if c19_`i'22q3==.
}


* Use 2022+ exepctations as 2023+ if missing 
qui foreach i in  sales emp capex cost emphrs unable {
replace c19_`i'23=c19_`i'22 if c19_`i'23==.
}
drop *22

* Impute 2023+ data from 2022 Q2 data if missing - no cost data for Q3 to use
qui foreach i in  sales emp capex cost unable {
reg covid_`i'23 c.c19_`i'22q2#i.ind2 i.ind2
predict `i'23_fit
replace c19_`i'23=`i'23_fit if c19_`i'23==.
}


* Interpolate for any data that is completely missing where questions not asked
* Costs and unable to work in 2023 Q3
gen c19_unable22q3=((c19_unable23-c19_unable22q2)/3)+c19_unable22q2
replace c19_cost22q3=((c19_cost23-c19_cost22q2)/3)+c19_cost22q2
* All covid impacts in 2022 Q4 - never asked about this period, interpolate between 2022 Q3 and 2023+
gen c19_sales22q4=0.5*(c19_sales22q3+c19_sales23)  
gen c19_emp22q4=0.5*(c19_emp22q3+c19_emp23)  
gen c19_capex22q4=0.5*(c19_capex22q3+c19_capex23)  
gen c19_cost22q4=0.5*(c19_cost22q3+c19_cost23)  
gen c19_unable22q4=0.5*(c19_unable22q3+c19_unable23)  


* Make sure unable to work and furlough shares are bounded at 0 to 100% after imputation
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23 {
replace c19_fur`i'=0 if c19_fur`i'<0 
replace c19_fur`i'=100 if c19_fur`i'>100 & c19_fur`i'~=.
replace c19_unable`i'=0 if c19_unable`i'<0                  
replace c19_unable`i'=100 if c19_unable`i'>100 & c19_unable`i'~=.
}

* Impute quality adjustment where missing
replace c19_qual=c19_qual_ind if c19_qual==.  //Use full sample industry average if missing//



* Generate other derived variables for productivity analysis 


* Subtract long run average from each quarter to get Covid impact on % unable to work
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23 {
replace c19_unable`i'=c19_unable`i'-c19_unable23
}


* Average hours hours worked per employee
* Use reported data where available, predict if missing using employment and sales impacts, don't include % unable to work because of risk of double counting 
qui foreach i in 20q2 20q3 20q4 21q1 21q2  {
reg c19_emphrs`i' c19_emp`i' c19_sales`i' c19_unable`i' i.ind2
predict c19_ahrs`i'
replace c19_ahrs`i'=c19_emphrs`i' if c19_emphrs`i'~=.
}
* For 2021 Q3 onwards, use % unable to work as the hours impact and assume that is the only way hours are reduced
qui foreach i in 21q3 21q4 22q1 22q2 22q3 22q4 23 {
gen c19_ahrs`i'=-1*c19_unable`i'
}


* Construct % of active employees, don't just add up because furloughed % is a percentage of people still employed
gen c19_emp20q2_a=-100*(1-(1+0.01*c19_emp20q2)*(1-0.01*c19_fur20q2))
replace c19_emp20q2_a=-100 if c19_emp20q2_a<-100
gen c19_emp20q3_a=-100*(1-(1+0.01*c19_emp20q3)*(1-0.01*c19_fur20q3))
replace c19_emp20q3_a=-100 if c19_emp20q3_a<-100
gen c19_emp20q4_a=-100*(1-(1+0.01*c19_emp20q4)*(1-0.01*c19_fur20q4))
replace c19_emp20q4_a=-100 if c19_emp20q4_a<-100
gen c19_emp21q1_a=-100*(1-(1+0.01*c19_emp21q1)*(1-0.01*c19_fur21q1))
replace c19_emp21q1_a=-100 if c19_emp21q1_a<-100
gen c19_emp21q2_a=-100*(1-(1+0.01*c19_emp21q2)*(1-0.01*c19_fur21q2))
replace c19_emp21q2_a=-100 if c19_emp21q2_a<-100
gen c19_emp21q3_a=-100*(1-(1+0.01*c19_emp21q3)*(1-0.01*c19_fur21q3))
replace c19_emp21q3_a=-100 if c19_emp21q3_a<-100
* After 2021 Q3 active employment equals total employment effect given furlough scheme closed
gen c19_emp21q4_a=c19_emp21q4
gen c19_emp22q1_a=c19_emp22q1
gen c19_emp22q2_a=c19_emp22q2
gen c19_emp22q3_a=c19_emp22q3
gen c19_emp22q4_a=c19_emp22q4
gen c19_emp23_a=c19_emp23


* Total hours worked
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23 {
gen c19_thrs`i'=(100+c19_emp`i'_a)*(1+0.01*c19_ahrs`i')-100
* Clean to keep within -100% to +100% range
replace c19_thrs`i'=100 if c19_thrs`i'>100 & c19_thrs`i'~=.
replace c19_thrs`i'=-100 if c19_thrs`i'<-100 & c19_thrs`i'~=.
}


* Reapply industry labels
label val ind2 ind2


* Merge in ONS data on industry cost shares
merge m:m ind2 using ".\ONS industry cost shares.dta"
drop if _m==2
drop _m


* Winsorise productivity components before calculations
winsor2 c19_sales* c19_emp* c19_thrs* c19_capex* c19_cost*, replace cuts(5 95)


* Capital impact - assume annual investment is 6% of the capital stock and just assume 4% aggregate annual depreciation (so 2% annual capital growth in steady state)
* Divide these numbers by 4 for quarterly data, as quarterly investment is 1.5% of capital stock and depreciation is 1%
gen c19_k20q2=1.5*(0.01*c19_capex20q2)
gen c19_k20q3=1.5*(0.01*c19_capex20q3)+(1-0.01*1)*c19_k20q2
gen c19_k20q4=1.5*(0.01*c19_capex20q4)+(1-0.01*1)*c19_k20q3
gen c19_k21q1=1.5*(0.01*c19_capex21q1)+(1-0.01*1)*c19_k20q4
gen c19_k21q2=1.5*(0.01*c19_capex21q2)+(1-0.01*1)*c19_k21q1
gen c19_k21q3=1.5*(0.01*c19_capex21q3)+(1-0.01*1)*c19_k21q2
gen c19_k21q4=1.5*(0.01*c19_capex21q4)+(1-0.01*1)*c19_k21q3
gen c19_k22q1=1.5*(0.01*c19_capex22q1)+(1-0.01*1)*c19_k21q4
gen c19_k22q2=1.5*(0.01*c19_capex22q2)+(1-0.01*1)*c19_k22q1
gen c19_k22q3=1.5*(0.01*c19_capex22q3)+(1-0.01*1)*c19_k22q2
gen c19_k22q4=1.5*(0.01*c19_capex22q4)+(1-0.01*1)*c19_k22q3
* For 2023 ,just add 2023 investment impact, multiply everything by 4 so annual rather than quarterly
gen c19_k23=1.5*4*(0.01*c19_capex23)+(1-0.01*4)*c19_k22q4


* Other variables
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23 {
* Covid impact on sales per employee
gen c19_rse`i'=c19_sales`i'-c19_emp`i'
* Sales per hour
gen c19_rshr`i'=c19_sales`i'-c19_thrs`i'
* Covid impact on GVA
gen c19_gva`i'=y_gva*(c19_sales`i'-m_y*100*(0.01*c19_sales`i'+0.01*c19_cost`i'+0.01*c19_sales`i'*0.01*c19_cost`i'))
* Covid impact on labour productivity per head
gen c19_lphd`i'=c19_gva`i'-c19_emp`i'
* Covid impact on labour productivity per hour
gen c19_lphr`i'=c19_gva`i'-c19_thrs`i' 
* Covid impact on TFP, using aggregate factor shares - See 'UK TFP data.xlsx' for original source
gen c19_tfp`i'=c19_gva`i'-(0.6294)*c19_thrs`i'-(1-0.6294)*c19_k`i'
}

* Productivity average variables
egen c19_lphr1=rmean(c19_lphr20q2 c19_lphr20q3 c19_lphr20q4 c19_lphr21q1)
egen c19_tfp1=rmean(c19_tfp20q2 c19_tfp20q3 c19_tfp20q4 c19_tfp21q1)
egen c19_thrs1=rmean(c19_thrs20q2 c19_thrs20q3 c19_thrs20q4 c19_thrs21q1)
egen c19_lphr2=rmean(c19_lphr21q2 c19_lphr21q3 c19_lphr21q4 c19_lphr22q1)
egen c19_tfp2=rmean(c19_tfp21q2 c19_tfp21q3 c19_tfp21q4 c19_tfp22q1)
egen c19_thrs2=rmean(c19_thrs21q2 c19_thrs21q3 c19_thrs21q4 c19_thrs22q1)
egen c19_cost1=rmean(c19_cost20q2 c19_cost20q3 c19_cost20q4 c19_cost21q1)
egen c19_cost2=rmean( c19_cost21q2 c19_cost21q3 c19_cost21q4 c19_cost22q1)
egen c19_sales=rmean(c19_sales20q2 c19_sales20q3 c19_sales20q4 c19_sales21q1 c19_sales21q2 c19_sales21q3 c19_sales21q4 c19_sales22q1)


* Keep only variables needed for paper
keep regnum id c19* ind2 id f2f y_gva m_y size
drop *_a c19_qual_ind c19_emphrs*
aorder
order regnum id ind2 id size f2f y_gva m_y  c19*

* Define labels for newly generated variables
order regnum id ind2 id size f2f c19_qual  y_gva m_y 
label var regnum "Companies House registered number"
label var id "Registered number (numeric version)"
label var ind2 "Industry group"
label var size "Size group for weighting (2 categories)"
label var f2f "Percentage of sales in 2019 involving face-to-face contact with customers"
label var c19_qual "Impact of Covid-19 on quality of goods services "
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4  {
label var c19_ahrs`i' `"Impact of Covid-19 on average hours per employee in 20`i'(%)"'
label var c19_capex`i' `"Impact of Covid-19 on investment in 20`i'(%)"'
label var c19_cost`i' `"Impact of Covid-19 on average unit costs in 20`i'(%)"'
label var c19_emp`i' `"Impact of Covid-19 on employment in 20`i'(%)"'
label var c19_fur`i' `"Percentage of employees on furlough in 20`i'(%)"'
label var c19_k`i' `"Impact of Covid-19 on capital in 20`i'(%)"'
label var c19_sales`i' `"Impact of Covid-19 on sales in 20`i'(%)"'
label var c19_thrs`i' `"Impact of Covid-19 on total hours worked in 20`i'(%)"'
label var c19_unable`i' `"Percentage of employees unable to work in 20`i'"'
label var c19_rse`i' `"Impact of Covid-19 on sales per employee in 20`i'(%)"'
label var c19_rshr`i' `"Impact of Covid-19 on sales per hour in 20`i'(%)"'
label var c19_gva`i' `"Impact of Covid-19 on GVA in 20`i'(%)"'
label var c19_lphd`i' `"Impact of Covid-19 on labour productivity per head in 20`i'(%)"'
label var c19_lphr`i' `"Impact of Covid-19 on labour productivity per hour in 20`i'(%)"'
label var c19_tfp`i' `"Impact of Covid-19 on TFP in 20`i'(%)"'
}
label var c19_ahrs23 "Impact of Covid-19 on average hours per employee in 2023+(%)"
label var c19_capex23 "Impact of Covid-19 on investment in 2023+(%)"
label var c19_cost23 "Impact of Covid-19 on average unit costs in 2023+(%)"
label var c19_emp23 "Impact of Covid-19 on employment in 2023+(%)"
label var c19_fur23 "Percentage of employees on furlough in 2023+(%)"
label var c19_k23 "Impact of Covid-19 on capital in 2023+(%)"
label var c19_sales23 "Impact of Covid-19 on sales in 2023+(%)"
label var c19_thrs23 "Impact of Covid-19 on total hours worked in 2023+(%)"
label var c19_unable23 "Percentage of employees unable to work in 2023+"
label var c19_rse23 "Impact of Covid-19 on sales per employee in 2023+(%)"
label var c19_rshr23 "Impact of Covid-19 on sales per hour in 2023+(%)"
label var c19_gva23 "Impact of Covid-19 on GVA in 2023+(%)"
label var c19_lphd23 "Impact of Covid-19 on labour productivity per head in 2023+(%)"
label var c19_lphr23 "Impact of Covid-19 on labour productivity per hour in 2023+(%)"
label var c19_tfp23 "Impact of Covid-19 on TFP in 2023+(%)"
label var c19_cost1 "Impact of Covid-19 on average unit costs in 1st year of pandemic(%)"
label var c19_cost2 "Impact of Covid-19 on average unit costs in 2nd year of pandemic(%)"
label var c19_thrs1 "Impact of Covid-19 on total hours worked in 1st year of pandemic(%)"
label var c19_thrs2 "Impact of Covid-19 on total hours worked in 2nd year of pandemic(%)"
label var c19_lphr1 "Impact of Covid-19 on labour productivity per hour in 1st year of pandemic(%)"
label var c19_lphr2 "Impact of Covid-19 on labour productivity per hour in 2nd year of pandemic(%)"
label var c19_tfp1 "Impact of Covid-19 on TFP in 1st year of pandemic(%)"
label var c19_tfp2 "Impact of Covid-19 on TFP in 2nd year of pandemic(%)"
label var c19_sales "Impact of Covid-19 on TFP in first two years of the pandemic(%)"

* Only keep observations that have tfp impacts
keep if c19_tfp20q2~=. & c19_tfp20q3~=. & c19_tfp20q4~=. & c19_tfp21q1~=. & c19_tfp21q2~=. & c19_tfp21q3~=. & c19_tfp21q4~=. & c19_tfp22q1~=. & c19_tfp22q2~=. & c19_tfp22q3~=. & c19_tfp22q4~=. & c19_tfp23~=.


* Save DMP data to merge
save ".\DMP data with Covid productivity impacts.dta", replace



* Merge different datasets together and create weights

* DMP data with productivity impacts
use ".\DMP data with Covid productivity impacts.dta", clear

* Merge in Dingle and Neiman teleworkability data
merge m:m ind2 using ".\Teleworking by industry.dta", nogen

* Merge in BVD accounts data
merge m:m regnum using  ".\BVD data to merge for Covid and productivity analysis.dta"
drop if _m==2
drop _m

* Merge in BVD data on productivity by industry 
merge m:m ind2 using ".\BVD pre Covid productivity by industry.dta", nogen


* Only keep observations with all relavent data - must have TFP impacts for all periods (and these firms will all have labour productivity data too)
keep if tfp2_last~=. & nemp_last~=. & c19_tfp20q2~=. & c19_tfp20q3~=. & c19_tfp20q4~=. & c19_tfp21q1~=. & c19_tfp21q2~=. & c19_tfp21q3~=. & c19_tfp21q4~=. & c19_tfp22q1~=. & c19_tfp22q2~=. & c19_tfp22q3~=. & c19_tfp22q4~=. & c19_tfp23~=.

* Generate weights
* These are employment based within industry and capped at 500, set minimum at 10
* Using employment data cleaned at 95th percentile, constain weights to match business register at 1 digit industry level for 2019
merge m:m ind2 using ".\2019 Business register weight shares industry only.dta", nogen
gen emp500=nemp_last
replace emp500=10 if nemp_last<10
replace emp500=500 if nemp_last>500 & nemp_last~=.
bysort ind2: egen emp_tot500=sum(emp500)  
gen weight=emp_share*emp500/emp_tot500  
label var weight "Employment weight for firms who have all data (max size = 500)"
label var emp_share "Industry employment share"
label var emp500 "Employment for weighting (capped at 500)

* Weight check
table ind2, c(sum weight)
tabstat weight, stats(Sum)
drop emp_tot500 

* Save data
save ".\Merged data for Covid and productivity analysis.dta", replace


* Set up industy level average Covid impact on sales data to merge for response analysis
preserve
collapse (mean) c19_sales [aw=weight], by(ind2)
save ".\Covid impacts by industry.dta", replace
restore




************
* Analysis *
************


*************************************
* DMP response rate analysis	    *
*************************************

* Note this version of the DMP data that includes non-respondents is not publically avaiable and it cannont be run from the SRS
* Load dataset
use "H:\DataPrivilege Secured Datasets\Decision Making Project\NEW FOLDER STRUCTURE\Data Analysis\# Data from Monthly Surveys Aggregated\Uncleaned DMP data 25 July 2022.dta", clear

* Figure A2: Active response rate
preserve
* Data prep
replace respond=100 if respond==1
keep if respond==0 | respond==100
drop if panel==5      					// These are data collected by phone outside of the normal survey round
keep respond regnum wave id ind2 
duplicates drop
bysort id wave: egen obs=count(respond)
tab obs
drop if obs==2
xtset id wave
qui forvalues i = 1/12 { 
gen respond_`i'=l`i'.respond 
}
* Define variable for having responded in the last month
egen active=rsum(respond respond_1 respond_2 respond_3 respond_4 respond_5 respond_6 respond_7 respond_8 respond_9 respond_10 respond_11 respond_12)
replace active=1 if active>1 & active~=.
tab active
* Active response rate data
table wave if respond>=0 & active==1, c(mean respond)
restore


* Table A1: Linear probability models for propensity to respond to the DMP

* Set up data
* Load BVD data
use ".\BVD data to merge for Covid and productivity analysis.dta", clear
* Expand the sample so have one observation per firm in the sampling frame for each DMP wave up to April 2022
keep lnprod_last ind2 regnum sample sic       
expand 68                                                     
bysort regnum: egen wave=seq(), from(1) to(69)	
egen id=group(regnum) 
* 3 digit SIC code
egen sic3=cut(sic), at(0(100)100000)
replace sic3=sic3/100
* Covid impact period dummy
gen cov=0 
replace cov=1 if wave>=43  
* Merge in whether respond from DMP data
merge m:m regnum wave using "H:\DataPrivilege Secured Datasets\Decision Making Project\NEW FOLDER STRUCTURE\Data Analysis\# Data from Monthly Surveys Aggregated\Uncleaned DMP data 25 July 2022.dta", keepusing(respond)
replace respond=0 if respond~=1
drop _m
* Merge in Covid impacts by industry
merge m:m ind2 using ".\Covid impacts by industry.dta"
 
* Table A1 regression table: 
* Allows time fixed effects to vary by when added  to sample frame as firms couldn't respond before being added to sample frame
reghdfe respond c19_sales if cov==1 & wave>28 & sample<5 & c19_sales~=. & wave<69, ab(wave#sample) cluster(id)
outreg2 using ".\Table A1.xls", bdec(3) sdec(3) replace
reghdfe respond c19_sales c.c19_sales#c.cov if wave>28 & sample<5 & lnprod_last~=.  & c19_sales~=.  & wave<69, ab(wave#sample sic3)  cluster(id)
outreg2 using ".\Table A1.xls", bdec(3) sdec(3) append
reghdfe respond lnprod_last c.lnprod_last#c.cov if wave>28 & sample<5 & lnprod_last~=.  & c19_sales~=.  & wave<69, ab(wave#sample sic3)  cluster(id)
outreg2 using ".\Table A1.xls", bdec(3) sdec(3) append



*************************************
* Figures based on full DMP dataset *
*************************************


* Load DMP data
use ".\DMP data before anonymisation August 2016 to July 2022.dta", clear
* For SRS users use the following file :
* use ".\Decision Maker Panel anonymised August 2016 to July 2022.dta", clear


* Figure 5 Panel A: CEO hours spent on Covid planning - Used for Figure 6
tab date if covidCEOhours_last6m~=.
tab covidCEOhours_last6m if covidCEOhours_next6m<9 [aw=weight]
tab covidCEOhours_next6m if covidCEOhours_next6m<9 [aw=weight]

* Figure 5 Panel B: Impact of Covid on types of investment 
* Set 2022+ data to missing if zero effect and no spending on that form of investment in 2019
replace exp_emp_22=. if training19==0 & exp_emp_22==0
replace exp_it_22=. if it19==0 & exp_it_22==0
replace exp_dev_22=. if rd19==0 & exp_dev_22==0
replace exp_equip_22=. if equip19==0 & exp_equip_22==0
replace exp_land_22=. if land19==0 & exp_land_22==0
tabstat  exp_emp_22 exp_it_22 exp_dev_22 exp_equip_22 exp_land_22 if wave>58 [aw=weight], stats(mean)
des exp_emp_22 exp_it_22 exp_dev_22 exp_equip_22 exp_land_22


* Figure 8 Panel A: Impact of Covid on quality of goods and services produced
tab covid_goods if covid_goods<6 [aw=weight]
tab covid_services if covid_services<6 [aw=weight]


* Figure A1 Panel A: % of employees on furlough
table dateq if emp_furlough~=. [aw=weight], c(mean emp_furlough)


* Figure A2: Number of responses per month
tab date


* Figure A3: DMP employment data versus company accounts
preserve
keep if year==2019
keep emp regnum
gen lnemp_dmp=ln(emp)
merge m:m regnum using ".\2019 accounts data to compare with DMP.dta", 
drop if _m==2
gen lnemp_bvd=ln(nemp)
* Drop firms with less than 10 employees
drop if nemp<10 | emp<10
* Charts for paper, binscatter then density function
binscatter  lnemp_dmp lnemp_bvd, ytitle("Log employment in DMP (2019)") xtitle("Log employment in accounts data (2019)") graphregion(color(white))
twoway kdensity lnemp_dmp,  lwidth(medthick) || kdensity lnemp_bvd, lwidth(medthick) legend(order(1 "DMP" 2 "Accounts data") cols(1) ring(0) position(2)) xtitle(Log employment in 2019) ytitle(Density) xlabel(2(2)12) xtick(2(2)12) graphregion(color(white)) 
restore

* Figure A3: DMP sales data versus company accounts
preserve
keep if qq==11 | qq==12 | qq==13 | qq==14    //Note, this is data that refers to 2019//
keep sales regnum
drop if sales==0
gen lnsales_dmp=ln(sales*4)                  //Annualize to make comparable to accounts data//
rename sales sales_dmp
merge m:m regnum using ".\2019 accounts data to compare with DMP.dta", 
drop if _m==2
gen lnsales_bvd=ln(sales)
* Drop firms with extreme sales for chart
drop if lnsales_bvd>20 & lnsales_bvd~=.
drop if lnsales_dmp>20 & lnsales_dmp~=.
* Charts for paper, binscatter then density function
binscatter  lnsales_dmp lnsales_bvd, ytitle("Log sales in DMP (2019)") xtitle("Log sales in accounts data (2019)") graphregion(color(white)) 
twoway kdensity lnsales_dmp,  lwidth(medthick) || kdensity lnsales_bvd, lwidth(medthick) legend(order(1 "DMP" 2 "Accounts data") cols(1) ring(0) position(2)) xtitle(Log sales in 2019) ytitle(Density) xlabel(2(2)20) xtick(2(2)20) graphregion(color(white)) 
restore


* Figure A14 Panel: Whether increase in unit costs is capacity or higher actual costs
tab capacityreduc_21q4 [aw=weight]



*************************************************
* Results based on combined and cleaned dataset *
*************************************************


* Load data
use ".\Merged data for Covid and productivity analysis.dta", clear


* Table 1: Covid impacts and exposure measures
* First generate dummy variables for having explanatory variables missing, to allow all observations to be used in regression
preserve
gen f2f_miss=0
replace f2f_miss=1 if f2f==.
replace f2f=9999 if f2f_miss==1
gen wfh19_miss=0
replace wfh19_miss=1 if wfh19==.
replace wfh19=9999 if wfh19_miss==1
gen online19_miss=0
replace online19_miss=1 if online19==.
replace online19=9999 if online19_miss==1
gen wagemiss=0
replace wagemiss=1 if lnwage==.
replace lnwage_last=9999 if wagemiss==1
* Estimates regressions for Covid impacts and exposure measures
reg c19_tfp1  tele f2f lnwage_last f2f_miss wagemiss, robust
outreg2 using ".\Table 1.xls", bdec(3) sdec(3) replace
reg c19_tfp2  tele f2f lnwage_last f2f_miss wagemiss, robust
outreg2 using ".\Table 1.xls", bdec(3) sdec(3) append
reg c19_tfp23  tele f2f lnwage_last f2f_miss wagemiss, robust
outreg2 using ".\Table 1.xls", bdec(3) sdec(3) append
reg c19_thrs1  tele f2f lnwage_last f2f_miss wagemiss, robust
outreg2 using ".\Table 1.xls", bdec(3) sdec(3) append
reg c19_thrs2  tele f2f lnwage_last f2f_miss wagemiss, robust
outreg2 using ".\Table 1.xls", bdec(3) sdec(3) append
reg c19_thrs23  tele f2f lnwage_last f2f_miss wagemiss, robust
outreg2 using ".\Table 1.xls", bdec(3) sdec(3) append
restore


* Table 2: Impact of Covid-19 on hours worked and pre-Covid productivity regression table 
reg c19_thrs1 lnprod_last  [aw=weight]
outreg2 using ".\Table 2.xls", bdec(3) sdec(3) replace
reghdfe c19_thrs1 lnprod_last  [aw=weight], ab(i.ind2) 
outreg2 using ".\Table 2.xls", bdec(3) sdec(3) append
reg c19_thrs2 lnprod_last  [aw=weight]
outreg2 using ".\Table 2.xls", bdec(3) sdec(3) append
reghdfe c19_thrs2 lnprod_last  [aw=weight], ab(i.ind2) 
outreg2 using ".\Table 2.xls", bdec(3) sdec(3) append
reg c19_thrs1 lntfp2_last  [aw=weight]
outreg2 using ".\Table 2.xls", bdec(3) sdec(3) append
reghdfe c19_thrs1 lntfp2_last  [aw=weight], ab(i.ind2) 
outreg2 using ".\Table 2.xls", bdec(3) sdec(3) append
reg c19_thrs2 lntfp2_last  [aw=weight]
outreg2 using ".\Table 2.xls", bdec(3) sdec(3) append
reghdfe c19_thrs2 lntfp2_last [aw=weight], ab(i.ind2) 
outreg2 using ".\Table 2.xls", bdec(3) sdec(3) append

* Summary stats for table footnote
su lnprod_last lntfp2_last [aw=weight]


* Figure 3: Impact of Covid-19 on businesses
* Sales
tabstat c19_sales20q2 c19_sales20q3 c19_sales20q4 c19_sales21q1 c19_sales21q2 c19_sales21q3 c19_sales21q4 c19_sales22q1 c19_sales22q2 c19_sales22q3 c19_sales22q4 c19_sales23 [aw=weight], stats(mean)
* Employment
tabstat c19_emp20q2 c19_emp20q3 c19_emp20q4 c19_emp21q1 c19_emp21q2 c19_emp21q3 c19_emp21q4 c19_emp22q1 c19_emp22q2 c19_emp22q3 c19_emp22q4 c19_emp23 [aw=weight], stats(mean)
* Total hours worked
tabstat c19_thrs20q2 c19_thrs20q3 c19_thrs20q4 c19_thrs21q1 c19_thrs21q2 c19_thrs21q3 c19_thrs21q4 c19_thrs22q1 c19_thrs22q2 c19_thrs22q3 c19_thrs22q4 c19_thrs23 [aw=weight], stats(mean)
* Unit costs
tabstat c19_cost20q2 c19_cost20q3 c19_cost20q4 c19_cost21q1 c19_cost21q2 c19_cost21q3 c19_cost21q4 c19_cost22q1 c19_cost22q2 c19_cost22q3 c19_cost22q4 c19_cost23 [aw=weight], stats(mean)
* Investment
tabstat c19_capex20q2 c19_capex20q3 c19_capex20q4 c19_capex21q1 c19_capex21q2 c19_capex21q3 c19_capex21q4 c19_capex22q1 c19_capex22q2 c19_capex22q3 c19_capex22q4 c19_capex23 [aw=weight], stats(mean)
* Capital 
tabstat c19_k20q2 c19_k20q3 c19_k20q4 c19_k21q1 c19_k21q2 c19_k21q3 c19_k21q4 c19_k22q1 c19_k22q2 c19_k22q3 c19_k22q4 c19_k23 [aw=weight], stats(mean)


* Data for Figure 4 and A9: Impact of Covid-19 on labour productivity per hour
preserve
* Calculate average aggregate productivity from latest accounts first
su prod_last [aw=weight]  , d 
gen prod_all=r(mean) 
* Calculate average industry productivity from latest accounts 
gen prod_ind=.
qui forvalues x = 1/14 { 
su prod_last if ind2==`x' [aw=weight]  , d 
replace prod_ind = r(mean) if ind2== `x'
}
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Cost contributions to GVA
gen cost_con`i'=-y_gva*m_y*c19_cost`i'-y_gva*m_y*0.01*c19_sales`i'*c19_cost`i'

* Project forward firm level labour productivity
gen prod_`i'=prod_last*(1+0.01*c19_lphr`i')

* Contributions to productivity
gen prod_`i'_l=prod_last*(1-0.01*c19_thrs`i')
gen prod_`i'_y=prod_last*(1+0.01*c19_sales`i')
gen prod_`i'_m=prod_last*(1+0.01*cost_con`i')

* Project forward weights
gen weight_`i'=weight*(1+0.01*c19_thrs`i')
egen weight_`i'_sum=sum(weight_`i')
replace weight_`i'=weight_`i'/weight_`i'_sum
* Calculate average aggregate productivity in each period
su prod_`i' [aw=weight_`i']  , d 
gen prod_all_`i'=r(mean)

* Mean productivity within industry
gen prod_ind_`i'=.
qui forvalues x = 1/14 { 
su prod_last if ind2==`x' [aw=weight_`i']  , d 
replace prod_ind_`i' = r(mean) if ind2== `x' 
}

* Construct within and between effects
gen within_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'-prod_last))/prod_all
gen between_`i'=100*((weight_`i'-weight)*(0.5*(prod_`i'+prod_last)))/prod_all
gen betind_`i'=100*((weight_`i'-weight)*(0.5*(prod_ind_`i'+prod_ind)))/prod_all
gen withind_`i'=100*((weight_`i'-weight)*(0.5*(prod_`i'+prod_last-prod_ind_`i'-prod_ind)))/prod_all
gen between2_`i'=withind_`i'+betind_`i'
gen total_`i'=within_`i'+between_`i'
gen total2_`i'=100*(prod_all_`i'-prod_all)/prod_all

* Construct contributions
gen con_l_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'_l-prod_last))/prod_all
gen con_y_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'_y-prod_last))/prod_all
gen con_m_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'_m-prod_last))/prod_all
gen within2_`i'=con_l_`i'+con_y_`i'+con_m_`i'
gen con_rse_`i'=con_l_`i'+con_y_`i'
gen con_gva_`i'=con_m_`i'+con_y_`i'
gen con_rs_`i'=con_y_`i'
}
* Summarise results - this is the data needed for Figure 4 and A9
* Variables in this table are as follows: Total impact; between firm impact; within firm impact; between 1 digit industry impact; within 1 digit industry impact; contribution of real sales per hour; contribution of intermediate costs; GVA contribution; labour input contribution; real sales contribution
foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
tabstat total_`i' between_`i' within_`i'  betind_`i' withind_`i' con_rse_`i' con_m_`i' con_gva_`i' con_l_`i' con_rs_`i', stats(sum)
}
restore


* Data for Figure 4 and A9: Impact of Covid-19 on TFP
preserve
replace prod_last=tfp2_last
* Calculate average aggregate productivity from latest accounts first
su prod_last [aw=weight]  , d 
gen prod_all=r(mean) 
* Calculate average industry productivity from latest accounts 
gen prod_ind=.
qui forvalues x = 1/14 { 
su prod_last if ind2==`x' [aw=weight]  , d 
replace prod_ind = r(mean) if ind2== `x'
}
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Cost contributions to GVA
gen cost_con`i'=-y_gva*m_y*c19_cost`i'-y_gva*m_y*0.01*c19_sales`i'*c19_cost`i'

* Project forward firm level TFP
gen prod_`i'=prod_last*(1+0.01*c19_tfp`i')

* Contributions to TFP: capital and labour contributions are based on TFP factor weights
gen prod_`i'_l=prod_last*(1-(0.6291)*0.01*c19_thrs`i')
gen prod_`i'_y=prod_last*(1+0.01*c19_sales`i')
gen prod_`i'_m=prod_last*(1+0.01*cost_con`i')
gen prod_`i'_k=prod_last*(1-(0.3709)*0.01*c19_k`i')

* Project forward weights 
gen weight_`i'=weight*(1+0.01*c19_thrs`i')
egen weight_`i'_sum=sum(weight_`i')
replace weight_`i'=weight_`i'/weight_`i'_sum
* Calculate average aggregate productivity in each period
su prod_`i' [aw=weight_`i']  , d 
gen prod_all_`i'=r(mean)

* Mean productivity within industry
gen prod_ind_`i'=.
qui forvalues x = 1/14 { 
su prod_last if ind2==`x' [aw=weight_`i']  , d 
replace prod_ind_`i' = r(mean) if ind2== `x' 
}

* Construct within and between effects
gen within_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'-prod_last))/prod_all
gen between_`i'=100*((weight_`i'-weight)*(0.5*(prod_`i'+prod_last)))/prod_all
gen betind_`i'=100*((weight_`i'-weight)*(0.5*(prod_ind_`i'+prod_ind)))/prod_all
gen withind_`i'=100*((weight_`i'-weight)*(0.5*(prod_`i'+prod_last-prod_ind_`i'-prod_ind)))/prod_all
gen between2_`i'=withind_`i'+betind_`i'
gen total_`i'=within_`i'+between_`i'
gen total2_`i'=100*(prod_all_`i'-prod_all)/prod_all

* Construct contributions
gen con_l_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'_l-prod_last))/prod_all
gen con_y_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'_y-prod_last))/prod_all
gen con_m_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'_m-prod_last))/prod_all
gen con_k_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'_k-prod_last))/prod_all
gen within2_`i'=con_l_`i'+con_y_`i'+con_m_`i'+prod_`i'_k
gen con_gva_`i'=con_m_`i'+con_y_`i'
gen con_rs_`i'=con_y_`i'
}
* Summarise results - this is the data needed for Figure 4 and A9
* Variables in this table are as follows: Total impact; between firm impact; within firm impact; between 1 digit industry impact; within 1 digit industry impact; GVA contribution; labour input contribution; capital input contribution; intermediate costs contrbution; real sales contribution
foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
tabstat total_`i' between_`i' within_`i'  betind_`i' withind_`i' con_gva_`i' con_l_`i' con_k_`i' con_m_`i' con_rs_`i', stats(sum)
}
restore


* Figure 6 Panel A: Within firm TFP kernel density 
preserve
replace c19_tfp1=. if c19_tfp1<-40 | c19_tfp1>40
replace c19_tfp2=. if c19_tfp2<-40 | c19_tfp2>40
* Colour version
twoway kdensity c19_tfp1,  lwidth(medthick) bwidth(1) || kdensity c19_tfp2,  lwidth(medthick)  bwidth(1) || kdensity c19_tfp23,  lwidth(medthick)  bwidth(1) legend(order(1 "1st year of pandemic (2020 Q2 to 2021 Q1)" 2 "2nd year of pandemic (2021 Q2 to 2022 Q1)" 3 "Medium term (2023+)") cols(1)  ) xtitle(Impact of Covid-19 on TFP) ytitle(Density) graphregion(color(white))  
* Greyscale version
twoway kdensity c19_tfp1,  lwidth(medthick) lcolor(black) bwidth(1) || kdensity c19_tfp2,  lwidth(medthick) lcolor(gs6) lpattern(dash) bwidth(1) || kdensity c19_tfp23,  lwidth(medthick) lcolor(gs10) lpattern(shortdash) bwidth(1) legend(order(1 "1st year of pandemic (2020 Q2 to 2021 Q1)" 2 "2nd year of pandemic (2021 Q2 to 2022 Q1)" 3 "Medium term (2023+)") cols(1)  ) xtitle(Impact of Covid-19 on TFP) ytitle(Density) graphregion(color(white))  
restore



* Figure 6 Panel B: Effects on TFP by industry in 2023+
preserve
 forvalues i = 1/14  {
* Rescale weights to equal zero
egen tot`i'=sum(weight) if ind2==`i'
replace weight=weight/tot`i' if ind2==`i'
qui su tfp2_last [aw=weight] if ind2==`i' , d 
gen tfp_all`i'=r(mean) if ind2==`i'
* Project forward firm level labour productivity
gen tfp_`i'=tfp2_last*(1+0.01*c19_tfp23) if ind2==`i'
* Project forward weights
gen weight_`i'=weight*(1+0.01*c19_thrs23) if ind2==`i'
egen weight_`i'_sum=sum(weight_`i') if ind2==`i'
replace weight_`i'=weight_`i'/weight_`i'_sum if ind2==`i'
* Calculate average aggregate productivity in each period
qui su tfp_`i' [aw=weight_`i']  if ind2==`i', d 
gen tfp_all_`i'=r(mean) if ind2==`i'
gen total_tfp`i'=100*(tfp_all_`i'-tfp_all`i')/tfp_all`i' if ind2==`i'
}
* Results
su total_tfp*, sep(20)
tab ind2
restore


* Figure 8 Panel B: Effects on TFP by industry in 2023+ after quality adjustment
preserve
* Impacts by industry
 forvalues i = 1/14  {
* Rescale weights to equal zero
egen tot`i'=sum(weight) if ind2==`i'
replace weight=weight/tot`i' if ind2==`i'
qui su tfp2_last [aw=weight] if ind2==`i' , d 
gen tfp_all`i'=r(mean) if ind2==`i'
* Knock quality adjustment of within-firm TFP here
replace c19_tfp23=c19_tfp23+c19_qual  if ind2==`i'
* Project forward firm level productivity
gen tfp_`i'=tfp2_last*(1+0.01*c19_tfp23) if ind2==`i'
* Project forward weights
gen weight_`i'=weight*(1+0.01*c19_thrs23) if ind2==`i'
egen weight_`i'_sum=sum(weight_`i') if ind2==`i'
replace weight_`i'=weight_`i'/weight_`i'_sum if ind2==`i'
* Calculate average aggregate productivity in each period
qui su tfp_`i' [aw=weight_`i']  if ind2==`i', d 
gen tfp_all_`i'=r(mean) if ind2==`i'
gen total_tfp`i'=100*(tfp_all_`i'-tfp_all`i')/tfp_all`i' if ind2==`i'
}
* Results
su total_tfp*, sep(20)
tab ind2
restore
* Aggregate impact
preserve 
egen tot=sum(weight) 
replace weight=weight/tot
qui su tfp2_last [aw=weight]  , d 
gen tfp_all=r(mean)
* Knock quality adjustment of within-firm TFP here
replace c19_tfp23=c19_tfp23+c19_qual 
* Project forward firm level productivity
gen tfp_adj=tfp2_last*(1+0.01*c19_tfp23) 
* Project forward weights
replace weight=weight*(1+0.01*c19_thrs23)
egen weight_sum=sum(weight) 
replace weight=weight/weight_sum 
* Calculate average aggregate productivity in each period
qui su tfp_adj [aw=weight]  , d 
gen tfp_all2=r(mean) 
gen total_tfp=100*(tfp_all2-tfp_all)/tfp_all
* Result
su total_tfp
restore
* Quality adjustment by industry cross check
table ind2 [aw=weight], c(mean c19_qual)


* Figure A6: BVD data for pre-Covid industry productivity in 2019
* Also used as an input for approximate replication files (BVD pre Covid productivity by industry.xlsx)
table ind2 [aw=weight], c(mean prod_last )
* By industry and size group for approximate replication
table ind2  [aw=weight], c(mean tfp2_last)


* Figure A7: Comaparing DMP productivity impacts vs accounting data
preserve
* Merge in latest accounting data
merge m:m regnum using  ".\BVD Covid period productivity growth to merge.dta"
drop if _m==2
* Align covid impact variables with accounting data
gen tfp_av=c19_tfp20q2/4 if (accmonth==4 | accmonth==5 | accmonth==6)
replace tfp_av=(c19_tfp20q2+c19_tfp20q3)/4 if (accmonth==7 | accmonth==8 | accmonth==9)
replace tfp_av=(c19_tfp20q2+c19_tfp20q3+c19_tfp20q4)/4 if (accmonth==10 | accmonth==11 | accmonth==12)
replace tfp_av=(c19_tfp20q2+c19_tfp20q3+c19_tfp20q4+c19_tfp21q1)/4 if (accmonth==1 | accmonth==2 | accmonth==3)
gen lphd_av=c19_lphd20q2/4 if (accmonth==4 | accmonth==5 | accmonth==6)
replace lphd_av=(c19_lphd20q2+c19_lphd20q3)/4 if (accmonth==7 | accmonth==8 | accmonth==9)
replace lphd_av=(c19_lphd20q2+c19_lphd20q3+c19_lphd20q4)/4 if (accmonth==10 | accmonth==11 | accmonth==12)
replace lphd_av=(c19_lphd20q2+c19_lphd20q3+c19_lphd20q4+c19_lphd21q1)/4 if (accmonth==1 | accmonth==2 | accmonth==3)
winsor2 dlntfp2 dlnprod, replace cuts(5 95)  
replace dlnprod=100*dlnprod
replace dlntfp2=100*dlntfp2
* Binscatters shown in the paper
binscatter tfp_av dlntfp2, xtitle("Change in TFP from accounting data, %") ytitle("Impact of Covid on TFP from DMP, %")
binscatter lphd_av dlnprod , xtitle("Change in labor productivity per worker from accounts, %") ytitle("Impact of Covid on labor prod. per worker from DMP, %") yscale(r(-30 -5)) ylabel(-30(5)-5)
restore


* Data for Figure A8: Impact of Covid-19 on labour productivity per head
preserve
* Calculate average aggregate productivity from latest accounts first
su prod_last [aw=weight]  , d 
gen prod_all=r(mean) 
* Calculate average industry productivity from latest accounts 
gen prod_ind=.
qui forvalues x = 1/14 { 
su prod_last if ind2==`x' [aw=weight]  , d 
replace prod_ind = r(mean) if ind2== `x'
}
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Cost contributions to GVA
gen cost_con`i'=-y_gva*m_y*c19_cost`i'-y_gva*m_y*0.01*c19_sales`i'*c19_cost`i'

* Project forward firm level labour productivity
gen prod_`i'=prod_last*(1+0.01*c19_lphd`i')

* Contributions to productivity
gen prod_`i'_l=prod_last*(1-0.01*c19_emp`i')
gen prod_`i'_y=prod_last*(1+0.01*c19_sales`i')
gen prod_`i'_m=prod_last*(1+0.01*cost_con`i')

* Project forward weights
gen weight_`i'=weight*(1+0.01*c19_emp`i')
egen weight_`i'_sum=sum(weight_`i')
replace weight_`i'=weight_`i'/weight_`i'_sum
* Calculate average aggregate productivity in each period
su prod_`i' [aw=weight_`i']  , d 
gen prod_all_`i'=r(mean)

* Mean productivity within industry
gen prod_ind_`i'=.
qui forvalues x = 1/14 { 
su prod_last if ind2==`x' [aw=weight_`i']  , d 
replace prod_ind_`i' = r(mean) if ind2== `x' 
}

* Construct within and between effects
gen within_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'-prod_last))/prod_all
gen between_`i'=100*((weight_`i'-weight)*(0.5*(prod_`i'+prod_last)))/prod_all
gen betind_`i'=100*((weight_`i'-weight)*(0.5*(prod_ind_`i'+prod_ind)))/prod_all
gen withind_`i'=100*((weight_`i'-weight)*(0.5*(prod_`i'+prod_last-prod_ind_`i'-prod_ind)))/prod_all
gen between2_`i'=withind_`i'+betind_`i'
gen total_`i'=within_`i'+between_`i'
gen total2_`i'=100*(prod_all_`i'-prod_all)/prod_all

* Construct contributions
gen con_l_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'_l-prod_last))/prod_all
gen con_y_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'_y-prod_last))/prod_all
gen con_m_`i'=100*(0.5*(weight_`i'+weight)*(prod_`i'_m-prod_last))/prod_all
gen within2_`i'=con_l_`i'+con_y_`i'+con_m_`i'
gen con_rse_`i'=con_l_`i'+con_y_`i'
gen con_gva_`i'=con_m_`i'+con_y_`i'
}
* Summarise results - this is the data needed for Figure A8
* Variables in this table are as follows: Total impact; between firm impact; within firm impact; between 1 digit industry impact; within 1 digit industry impact; contribution of real sales per employee; contribution of intermediate costs; GVA contribution; labour input contribution
foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23 {
tabstat total_`i' between_`i' within_`i'  betind_`i' withind_`i' con_rse_`i' con_m_`i' con_gva_`i' con_l_`i', stats(sum)
}
restore


* Figure A9: Contributions to impact of Covid-19 on within-firm productivity - see code above to generate data for Figure 4


* Figure A10: Hours impacts and pre-Covid productivity 
* Panel A: Labour productivity
binscatter c19_thrs1 lnprod_last , n(25) xscale(r(2 6)) xlabel(2(1)6)ytitle(Impact of Covid-19 on total hours worked in 1st year of pandemic) xtitle(Log labour productivity per job from accounts) ylab(,nogrid) graphregion(color(white)) bgcolor(white) savedata(".\Figure A10 labour productivity year 1 binscatter data")
binscatter c19_thrs2 lnprod_last , n(25) yscale(r(-60 0)) ylabel(-60(10)0) xscale(r(2 6)) xlabel(2(1)6)ytitle(Impact of Covid-19 on total hours worked in 2nd year of pandemic) xtitle(Log labour productivity per job from accounts) ylab(,nogrid) graphregion(color(white)) bgcolor(white) savedata(".\Figure A10 labour productivity year 2 binscatter data")
* Best fit lines
reg c19_thrs1 lnprod_last
reg c19_thrs2 lnprod_last 
* Panel B: TFP
binscatter c19_thrs1 lntfp2_last , n(25) ytitle(Impact of Covid-19 on total hours worked in 1st year of pandemic) xtitle(Log TFP from accounts) ylab(,nogrid) graphregion(color(white)) bgcolor(white) savedata(".\Figure A10 TFP year 1 binscatter data")
binscatter c19_thrs2 lntfp2_last , n(25) ytitle(Impact of Covid-19 on total hours worked in 2nd year of pandemic) xtitle(Log TFP from accounts) ylab(,nogrid) graphregion(color(white)) bgcolor(white) savedata(".\Figure A10 TFP year 2 binscatter data")
* Best fit lines
reg c19_thrs1 lntfp2_last
reg c19_thrs2 lntfp2_last 


* Figure A11: Hours worked and capital by industry
table ind2 [aw=weight], c(mean c19_thrs1 mean c19_thrs2)
table ind2 [aw=weight], c(mean c19_k21q1 mean c19_k22q1)

* Text fact, Q2 sales impact by industry
table ind2 [aw=weight], c(mean c19_sales20q2 )


* Figure A14 Panel B: Alternative TFP impact based on using unit costs for Covid 19 impacts
preserve
gen weight2=weight
qui su tfp2_last [aw=weight]  , d 
gen tfp_all=r(mean) 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Project forward firm level labour productivity
gen tfp_`i'=tfp2_last*(1-0.01*c19_cost`i')                   //This is where alternative within firm TFP measure is switched in, costs have a negative in front as higher costs=lower tfp
* Project forward weights
gen weight2_`i'=weight2*(1+0.01*c19_thrs`i')
egen weight2_`i'_sum=sum(weight2_`i')
replace weight2_`i'=weight2_`i'/weight2_`i'_sum
* Calculate average aggregate productivity in each period
qui su tfp_`i' [aw=weight2_`i']  , d 
gen tfp_all_`i'=r(mean)
gen total_tfp`i'=100*(tfp_all_`i'-tfp_all)/tfp_all
}
* Results
tabstat total_tfp*, stats(mean)
restore

* Figure A14 Panel B: Using industry level factor shares in TFP calculation instead of aggregate
* Firm level covid impact on TFP using industry level factor shares 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23 {
gen c19_tfpv3`i'=c19_gva`i'-(lab_weight)*c19_thrs`i'-(1-lab_weight)*c19_k`i'
}
preserve
* Aggregate TFP calculations
qui su tfp3_last [aw=weight]  , d 
gen tfp_all=r(mean) 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Project forward firm level labour productivity
gen tfp_`i'=tfp3_last*(1+0.01*c19_tfpv3`i')
* Project forward weights
gen weight_`i'=weight*(1+0.01*c19_thrs`i')
egen weight_`i'_sum=sum(weight_`i')
replace weight_`i'=weight_`i'/weight_`i'_sum
* Calculate average aggregate productivity in each period
qui su tfp_`i' [aw=weight_`i']  , d 
gen tfp_all_`i'=r(mean)
gen total_tfp`i'=100*(tfp_all_`i'-tfp_all)/tfp_all
}
* Results
tabstat total_tfp*, stats(mean)
restore


* Figure A13: Bootstrapping confidence intervals around main estimates
use ".\Merged data for Covid and productivity analysis.dta", clear

* Only keep variables needed for the boostrapping
keep id c19_thrs* c19_lphr* c19_tfp* ind2 prod_last tfp2_last weight* emp_share  emp500  
drop c19_tfp1 c19_tfp2 c19_lphr1 c19_lphr2 c19_thrs1 c19_thrs2
gen hasall=1

* Calculate mean impacts
preserve
* Labour productivity
* Calculate average aggregate productivity from latest accounts first
qui su prod_last [aw=weight]  , d 
gen prod_all=r(mean) 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Project forward firm level labour productivity
gen prod_`i'=prod_last*(1+0.01*c19_lphr`i')
* Project forward weights
gen weight_`i'=weight*(1+0.01*c19_thrs`i')
egen weight_`i'_sum=sum(weight_`i')
replace weight_`i'=weight_`i'/weight_`i'_sum
* Calculate average aggregate productivity in each period
qui su prod_`i' [aw=weight_`i']  , d 
gen prod_all_`i'=r(mean)
gen total_lp`i'=100*(prod_all_`i'-prod_all)/prod_all
}

* TFP
gen weight2=weight
qui su tfp2_last [aw=weight]  , d 
gen tfp_all=r(mean) 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Project forward firm level labour productivity
gen tfp_`i'=tfp2_last*(1+0.01*c19_tfp`i')
* Project forward weights
gen weight2_`i'=weight2*(1+0.01*c19_thrs`i')
egen weight2_`i'_sum=sum(weight_`i')
replace weight2_`i'=weight2_`i'/weight2_`i'_sum
* Calculate average aggregate productivity in each period
qui su tfp_`i' [aw=weight2_`i']  , d 
gen tfp_all_`i'=r(mean)
gen total_tfp`i'=100*(tfp_all_`i'-tfp_all)/tfp_all
}

* Summarise results
tabstat total_lp*, stats(mean)
tabstat total_tfp*, stats(mean)
* Save results and create dataset with just the means in there
collapse (mean) total_lp* total_tfp*, by(hasall)
gen run=0
save ".\Bootstrapped sample.dta", replace
restore


* Loop to draw 1000 random samples and calculate confidence intervals
forvalues x = 1/1000 { 
preserve
* Draw random sample with replacement
set seed `x'
bsample 
* Recalculate weights so that they sum to one
drop weight
bysort ind2: egen emp_tot=sum(emp500)  
gen weight=emp_share*emp500/emp_tot 

* Labour productivity
* Calculate average aggregate productivity from latest accounts first
qui su prod_last [aw=weight]  , d 
gen prod_all=r(mean) 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Project forward firm level labour productivity
gen prod_`i'=prod_last*(1+0.01*c19_lphr`i')
* Project forward weights
gen weight_`i'=weight*(1+0.01*c19_thrs`i')
egen weight_`i'_sum=sum(weight_`i')
replace weight_`i'=weight_`i'/weight_`i'_sum
* Calculate average aggregate productivity in each period
qui su prod_`i' [aw=weight_`i']  , d 
gen prod_all_`i'=r(mean)
gen total_lp`i'=100*(prod_all_`i'-prod_all)/prod_all
}

* TFP
gen weight2=weight
qui su tfp2_last [aw=weight]  , d 
gen tfp_all=r(mean) 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Project forward firm level labour productivity
gen tfp_`i'=tfp2_last*(1+0.01*c19_tfp`i')
* Project forward weights
gen weight2_`i'=weight2*(1+0.01*c19_thrs`i')
egen weight2_`i'_sum=sum(weight_`i')
replace weight2_`i'=weight2_`i'/weight2_`i'_sum
* Calculate average aggregate productivity in each period
qui su tfp_`i' [aw=weight2_`i']  , d 
gen tfp_all_`i'=r(mean)
gen total_tfp`i'=100*(tfp_all_`i'-tfp_all)/tfp_all
}

* Save results and append each replication to the data file
collapse (mean) total_*, by(hasall)
gen run=`x'
append using ".\Bootstrapped sample.dta"
save ".\Bootstrapped sample.dta", replace
restore
}


use ".\Bootstrapped sample.dta", clear

* Make time numeric so that reshape command works
rename *20q2 *1
rename *20q3 *2
rename *20q4 *3
rename *21q1 *4
rename *21q2 *5
rename *21q3 *6
rename *21q4 *7
rename *22q1 *8
rename *22q2 *9
rename *22q3 *10
rename *22q4 *11
rename *23 *12

* Reshape to long form
reshape long total_lp total_tfp, i(run) j(time)

* Generate 2.5 and 97.5 percentiles
gen total_lp_95l=.
gen total_lp_95h=.
gen total_tfp_95l=.
gen total_tfp_95h=.
qui forvalues i = 1(1)12 {
_pctile total_lp if time==`i' & run~=0, percentile(2.5 97.5)
replace total_lp_95l=r(r1) if time==`i'
replace total_lp_95h=r(r2) if time==`i'
_pctile total_tfp if time==`i' & run~=0, percentile(2.5 97.5)
replace total_tfp_95l=r(r1) if time==`i'
replace total_tfp_95h=r(r2) if time==`i'
}

* Labour productivity
* 90% and 95% confidence intervals + simulated mean
table time if run~=0, c(p5 total_lp p95 total_lp mean total_lp_95l mean total_lp_95h mean total_lp)
* Actual mean 
table time if run==0, c(mean total_lp)

* TFP
* 90% and 95% confidence intervals + simulated mean
table time if run~=0, c(p5 total_tfp p95 total_tfp mean total_tfp_95l mean total_tfp_95h mean total_tfp)
* Actual mean 
table time if run==0, c(mean total_tfp)



***********************************
* Results based on just BVD data  *
***********************************

* Load data
use ".\BVD data for entry and exit analysis.dta", clear

* Set up employment weight that is capped at 500, set a minimum to 10
gen emp_weight=nemp
replace emp_weight=500 if nemp>500 & nemp~=.
replace emp_weight=10 if nemp<10

* Table A3: Firm entry/exit and productivity
* New firms entering (first year of accounts if 2016 to 2019, that is estimation period)
reghdfe lnprod new if year>2015 & year<2022  & fail~=. & new~=. [aw=emp_weight], ab(year) cluster(id)
outreg2 using ".\Table A3.xls", bdec(3) sdec(3) replace
reghdfe lnprod new yr2 yr3 if year>2015 & year<2022  & fail~=. & new~=. [aw=emp_weight], ab(year) cluster(id)
outreg2 using ".\Table A3.xls", bdec(3) sdec(3) append
* Firms exiting
reghdfe lnprod fail if year>2015 & year<2022  & fail~=. & new~=. [aw=emp_weight], ab(year) cluster(id)
outreg2 using ".\Table A3.xls", bdec(3) sdec(3) append
reghdfe lnprod c.fail#i.cov_fail if year>2015 & year<2022  & fail~=. & new~=. [aw=emp_weight], ab(year) cluster(id)
outreg2 using ".\Table A3.xls", bdec(3) sdec(3) append
test c.fail#0.cov_fail=c.fail#1.cov_fail                    //Test that coefficient is the same pre and during the pandemic




*************************************************************
* Approximate results that don't involve and BVD microdata  *
*************************************************************


* Code to run with full dataset to derive equation to impute productivity - this bit cannot be run from the SRS dataset because it doesn't contain BVD pre-covid productivity
* Load data
use ".\Merged data for Covid and productivity analysis.dta", clear

* Code to impue firm-level productivity using industry and impact on hours worked
* Labour productivity
reg prod_last c19_thrs20q2 c19_thrs20q3 c19_thrs20q4 c19_thrs21q1 c19_thrs21q2 c19_thrs21q3 c19_thrs21q4 c19_thrs22q1 c19_thrs22q2 c19_thrs22q3 c19_thrs22q4 c19_thrs23 i.ind2 [aw=weight]
outreg2 using ".\Pre covid productivity imputation regressions.xls", bdec(3) sdec(3) replace
* TFP
reg tfp2_last c19_thrs20q2 c19_thrs20q3 c19_thrs20q4 c19_thrs21q1 c19_thrs21q2 c19_thrs21q3 c19_thrs21q4 c19_thrs22q1 c19_thrs22q2 c19_thrs22q3 c19_thrs22q4 c19_thrs23 i.ind2 [aw=weight]
outreg2 using ".\Pre covid productivity imputation regressions.xls", bdec(3) sdec(3) append
predict tfp_fit2
* Coefficients from this equation are saved in ".\Coefficients for productivity imputation.dta"


* Summary of main results using actual paper data
preserve
* Hourly labour productivity calcuations
* Calculate average aggregate productivity from latest accounts first
qui su prod_last [aw=weight]  , d 
gen prod_all=r(mean) 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Project forward firm level labour productivity
gen prod_`i'=prod_last*(1+0.01*c19_lphr`i')
* Project forward weights
gen weight_`i'=weight*(1+0.01*c19_thrs`i')
egen weight_`i'_sum=sum(weight_`i')
replace weight_`i'=weight_`i'/weight_`i'_sum
* Calculate average aggregate productivity in each period
qui su prod_`i' [aw=weight_`i']  , d 
gen prod_all_`i'=r(mean)
gen total_lp`i'=100*(prod_all_`i'-prod_all)/prod_all
}
* TFP calculations
gen weight2=weight
qui su tfp2_last [aw=weight]  , d 
gen tfp_all=r(mean) 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Project forward firm level labour productivity
gen tfp_`i'=tfp2_last*(1+0.01*c19_tfp`i')
* Project forward weights
gen weight2_`i'=weight2*(1+0.01*c19_thrs`i')
egen weight2_`i'_sum=sum(weight2_`i')
replace weight2_`i'=weight2_`i'/weight2_`i'_sum
* Calculate average aggregate productivity in each period
qui su tfp_`i' [aw=weight2_`i']  , d 
gen tfp_all_`i'=r(mean)
gen total_tfp`i'=100*(tfp_all_`i'-tfp_all)/tfp_all
}
* Results - Labour productivity first, then TFP
tabstat total_lp*, stats(mean)
tabstat total_tfp*, stats(mean)
count
restore





* Code to approximately replicate main results without needing BVD firm level data - this can be run using DMP data in the SRS

* DMP data with productivity impacts
use ".\DMP data with Covid productivity impacts.dta", clear

* Merge in BVD data on productivity by industry 
merge m:m ind2 using ".\BVD pre Covid productivity by industry.dta", nogen


* Impute productivity using data from above
* Merge in coefficients for productivity imputation
merge m:m ind2 using ".\Coefficients for productivity imputation.dta", nogen
gen prod_fit=c+a1+b1*c19_thrs20q2+b2*c19_thrs20q3+b3*c19_thrs20q4+b4*c19_thrs21q1+b5*c19_thrs21q2+b6*c19_thrs21q3+b7*c19_thrs21q4+b8*c19_thrs22q1+b9*c19_thrs22q2+b10*c19_thrs22q3+b11*c19_thrs22q4+b12*c19_thrs23
gen tfp_fit=C+A1+B1*c19_thrs20q2+B2*c19_thrs20q3+B3*c19_thrs20q4+B4*c19_thrs21q1+B5*c19_thrs21q2+B6*c19_thrs21q3+B7*c19_thrs21q4+B8*c19_thrs22q1+B9*c19_thrs22q2+B10*c19_thrs22q3+B11*c19_thrs22q4+B12*c19_thrs23


* Generate weights based on industry and firm size group to match 2019 business register, as you can calculate from SRS version (slighlty different weight calculation here as don't have actual employment levels from BVD in SRS)
egen wgrp=group(ind2 size) 
merge m:m wgrp using ".\2019 Business register weight shares with size and industry.dta", nogen
bysort wgrp: egen wgrp_count=count(wgrp) 
gen weight=wgrp_share/wgrp_count
label var weight "Weight"

* Weight check
table ind2, c(sum weight)
table size, c(sum weight)
table ind2 size, c(sum weight)
tabstat weight, stats(Sum)
drop wgrp wgrp_share wgrp_count



* Code to approximatley replicate main results using just BVD industry productivity data - this will exclude within industry reallocation effects
preserve
* Hourly labour productivity calcuations
* Calculate average aggregate productivity from latest accounts first
qui su prod_last_ind [aw=weight]  , d 
gen prod_all=r(mean) 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Project forward firm level labour productivity
gen prod_`i'=prod_last_ind*(1+0.01*c19_lphr`i')
* Project forward weights
gen weight_`i'=weight*(1+0.01*c19_thrs`i')
egen weight_`i'_sum=sum(weight_`i')
replace weight_`i'=weight_`i'/weight_`i'_sum
* Calculate average aggregate productivity in each period
qui su prod_`i' [aw=weight_`i']  , d 
gen prod_all_`i'=r(mean)
gen total_lp`i'=100*(prod_all_`i'-prod_all)/prod_all
}
tabstat total_lp*, stats(mean)

* TFP calculations
gen weight2=weight
qui su tfp2_last_ind [aw=weight]  , d 
gen tfp_all=r(mean) 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Project forward firm level labour productivity
gen tfp_`i'=tfp2_last_ind*(1+0.01*c19_tfp`i')
* Project forward weights
gen weight2_`i'=weight2*(1+0.01*c19_thrs`i')
egen weight2_`i'_sum=sum(weight2_`i')
replace weight2_`i'=weight2_`i'/weight2_`i'_sum
* Calculate average aggregate productivity in each period
qui su tfp_`i' [aw=weight2_`i']  , d 
gen tfp_all_`i'=r(mean)
gen total_tfp`i'=100*(tfp_all_`i'-tfp_all)/tfp_all
}
* Results
tabstat total_lp*, stats(mean)
tabstat total_tfp*, stats(mean)
restore
count



* Code to approximatley replicate main results using imputed productivity data
preserve
* Hourly labour productivity calcuations
* Calculate average aggregate productivity from latest accounts first
replace prod_last=prod_fit
qui su prod_last_ind [aw=weight]  , d 
gen prod_all=r(mean) 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Project forward firm level labour productivity
gen prod_`i'=prod_fit*(1+0.01*c19_lphr`i')
* Project forward weights
gen weight_`i'=weight*(1+0.01*c19_thrs`i')
egen weight_`i'_sum=sum(weight_`i')
replace weight_`i'=weight_`i'/weight_`i'_sum
* Calculate average aggregate productivity in each period
qui su prod_`i' [aw=weight_`i']  , d 
gen prod_all_`i'=r(mean)
gen total_lp`i'=100*(prod_all_`i'-prod_all)/prod_all
}
tabstat total_lp*, stats(mean)
* TFP calculations
gen weight2=weight
qui su tfp_fit [aw=weight]  , d 
gen tfp_all=r(mean) 
qui foreach i in 20q2 20q3 20q4 21q1 21q2 21q3 21q4 22q1 22q2 22q3 22q4 23  {
* Project forward firm level labour productivity
gen tfp_`i'=tfp_fit*(1+0.01*c19_tfp`i')
* Project forward weights
gen weight2_`i'=weight2*(1+0.01*c19_thrs`i')
egen weight2_`i'_sum=sum(weight2_`i')
replace weight2_`i'=weight2_`i'/weight2_`i'_sum
* Calculate average aggregate productivity in each period
qui su tfp_`i' [aw=weight2_`i']  , d 
gen tfp_all_`i'=r(mean)
gen total_tfp`i'=100*(tfp_all_`i'-tfp_all)/tfp_all
}
* Results
tabstat total_lp*, stats(mean)
tabstat total_tfp*, stats(mean)
restore
count



* Close log file
log close














